library(tidyverse)
## -- Attaching packages ------------------------ tidyverse 1.2.1 --
## v ggplot2 3.0.0 v purrr 0.2.5
## v tibble 1.4.2 v dplyr 0.7.8
## v tidyr 0.8.1 v stringr 1.3.1
## v readr 1.1.1 v forcats 0.3.0
## -- Conflicts --------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(stringr)
library(readr)
library(readxl)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
library(maps)
library(gsheet)
Read original data
library(gsheet)
heart_disease_stratify = gsheet2tbl('docs.google.com/spreadsheets/d/1W-xVHLoeZj37wrZdX-dP_SVouZKg5VmTpFdmOLWByR4/edit?usp=sharing') %>%
janitor::clean_names() %>%
rename(state = location_abbr) %>%
rename(mortality_rate = data_value) %>%
mutate(state = state.name[match(state, state.abb)]) %>%
select(-data_source, -geographic_level, -class, -topic, -data_value_footnote, -data_value_footnote_symbol, -topic_id, -location_id )
heart_disease = heart_disease_stratify %>%
filter(stratification1 == "Overall", stratification2 == "Overall") %>%
select(-stratification1, -stratification2, -stratification_category1, -stratification_category2)
heart_disease$mortality_rate[is.na(heart_disease$mortality_rate)] = 0
heart_disease =
heart_disease %>%
group_by(state) %>%
summarise(mortality_rate = mean(mortality_rate))
Add air quality data
airquality_2015 = read_csv("data/airquality.csv") %>%
janitor::clean_names() %>%
select(state, pm2_5) %>%
group_by(state) %>%
summarize(pm2.5 = sum(pm2_5))
## Parsed with column specification:
## cols(
## State = col_character(),
## County = col_character(),
## Year = col_integer(),
## `Days with AQI` = col_integer(),
## `Good Days` = col_integer(),
## `Moderate Days` = col_integer(),
## `Unhealthy for Sensitive Groups Days` = col_integer(),
## `Unhealthy Days` = col_integer(),
## `Very Unhealthy Days` = col_integer(),
## `Hazardous Days` = col_integer(),
## `Max AQI` = col_integer(),
## `90th Percentile AQI` = col_integer(),
## `Median AQI` = col_integer(),
## `Days CO` = col_integer(),
## `Days NO2` = col_integer(),
## `Days Ozone` = col_integer(),
## `Days SO2` = col_integer(),
## PM2.5 = col_integer(),
## `Days PM10` = col_integer()
## )
Add obesity data
obesity_data = read_csv("data/National_Obesity_By_State.csv") %>%
janitor::clean_names() %>%
rename(state = name) %>%
rename(obesity_percentage = obesity) %>%
select(state, obesity_percentage)
## Parsed with column specification:
## cols(
## OBJECTID = col_integer(),
## NAME = col_character(),
## Obesity = col_double(),
## Shape__Area = col_double(),
## Shape__Length = col_double()
## )
data_with_obesity = left_join(heart_disease, obesity_data)
## Joining, by = "state"
Add stroke data
stroke_data = read_csv("data/Stroke_Mortality_Data_Among_US_Adults__35___by_State_Territory_and_County.csv") %>%
janitor::clean_names() %>%
rename(stroke_value=data_value)%>%
rename(state = location_abbr) %>%
mutate(state = state.name[match(state, state.abb)])%>%
select(state,stroke_value) %>%
group_by(state) %>%
filter(!is.na(stroke_value)) %>%
summarize(stroke_value = sum(stroke_value))
## Parsed with column specification:
## cols(
## Year = col_integer(),
## LocationAbbr = col_character(),
## LocationDesc = col_character(),
## GeographicLevel = col_character(),
## DataSource = col_character(),
## Class = col_character(),
## Topic = col_character(),
## Data_Value = col_double(),
## Data_Value_Unit = col_character(),
## Data_Value_Type = col_character(),
## Data_Value_Footnote_Symbol = col_character(),
## Data_Value_Footnote = col_character(),
## StratificationCategory1 = col_character(),
## Stratification1 = col_character(),
## StratificationCategory2 = col_character(),
## Stratification2 = col_character(),
## TopicID = col_character(),
## LocationID = col_character(),
## `Location 1` = col_character()
## )
Add income
income_data = read_excel("data/income_2015.xlsx", range = "A4:D55") %>%
janitor::clean_names() %>%
rename(state = united_states, median_income = x55117, income_standard_error = x253)
data_with_income = left_join(heart_disease,income_data, by = "state")
data_income_obesity = left_join(income_data,data_with_obesity, by = "state")
smoking_data = gsheet2tbl('https://docs.google.com/spreadsheets/d/1ZU7uuqV-EZM81hE4kq0nFiPJHUMJj82fnXVllGFHJ00/edit?usp=sharing') %>%
mutate(year = 2015) %>%
rename(state = LocationDesc) %>%
select(-YEAR) %>%
filter(!is.na(Data_Value)) %>%
select(year, state, Data_Value) %>%
select(year, state, Data_Value) %>%
rename(tobacco_comsumption = Data_Value) %>%
group_by(state) %>%
summarise(tobacco_consumption = sum(tobacco_comsumption))
data_income_obesity_smoking = left_join(smoking_data, data_income_obesity, by = "state")
data_income_obesity_smoking_air = left_join(airquality_2015, data_income_obesity_smoking, by = "state")
data_income_obesity_smoking = left_join(smoking_data, data_income_obesity, by = "state")
data_income_obesity_smoking_air = left_join(airquality_2015, data_income_obesity_smoking, by = "state")
final_data_export = left_join(stroke_data, data_income_obesity_smoking_air, by = "state")
write.csv(final_data_export, file = "data/final_data.csv")
Find the association between smoke and heart disease mortality
final_data = read_csv("data/final_data.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## X1 = col_integer(),
## state = col_character(),
## stroke_value = col_double(),
## pm2.5 = col_integer(),
## tobacco_consumption = col_double(),
## us = col_character(),
## median_income = col_integer(),
## income_standard_error = col_integer(),
## mortality_rate = col_double(),
## obesity_percentage = col_double()
## )
final_data %>%
mutate(state = forcats::fct_reorder(factor(state), tobacco_consumption)) %>%
ggplot(aes(x = mortality_rate, y = tobacco_consumption)) +
geom_point(aes(color = state), alpha = .5) +
stat_smooth(method = "lm", col = "red") +
labs(
title = "Tabacco Consumption Accross states"
) +
theme(text = element_text(size = 8), axis.text = element_text(angle = 60, hjust = 1), legend.position = "bottom")
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
lm(mortality_rate~tobacco_consumption, data = final_data) %>%
summary()
##
## Call:
## lm(formula = mortality_rate ~ tobacco_consumption, data = final_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -124.595 -36.583 3.269 35.753 104.944
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -208.91946 121.60835 -1.718 0.0922 .
## tobacco_consumption 0.05017 0.01108 4.526 3.96e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 52.21 on 48 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.2992, Adjusted R-squared: 0.2846
## F-statistic: 20.49 on 1 and 48 DF, p-value: 3.959e-05
Find the association between obesity and heart disease mortality
final_data %>%
ggplot(aes(x = obesity_percentage, y = mortality_rate)) +
geom_point(aes(color = state), alpha = .5) +
stat_smooth(method = "lm", col = "red") +
labs(
x = "Obesity(%)",
y = "Mortality",
title = "Heart Disease Mortality vs Obesity"
) +
theme(text = element_text(size = 8),
axis.text = element_text(angle = 60, hjust = 1),
legend.position = "bottom",
plot.title = element_text(hjust = 0.5))
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
lm(mortality_rate ~ obesity_percentage, data = final_data) %>%
broom::tidy()
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -17.8 46.9 -0.380 7.06e- 1
## 2 obesity_percentage 12.2 1.58 7.71 6.09e-10
Find the association between income and heart disease mortality
final_data_income =
final_data %>%
mutate(state = forcats::fct_reorder(factor(state), median_income))
final_data_income %>%
ggplot(aes(x = mortality_rate, y = median_income, color = state)) +
geom_point() +
stat_smooth(method = "lm", col = "red") +
theme(text = element_text(size = 8), axis.text.x = element_text(angle = 60, hjust = 1), legend.position = "bottom") +
#Add the title and the name for x and y axis.
labs(
title = "Association between Income and Heart Disease Mortality Rate",
x = "Mortality Rate",
y = "median_income"
)
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
lm(mortality_rate ~ median_income, data = final_data_income) %>%
summary()
##
## Call:
## lm(formula = mortality_rate ~ median_income, data = final_data_income)
##
## Residuals:
## Min 1Q Median 3Q Max
## -80.924 -29.753 -7.434 28.817 102.081
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.102e+02 3.994e+01 15.279 < 2e-16 ***
## median_income -4.826e-03 7.058e-04 -6.838 1.3e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44.39 on 48 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.4935, Adjusted R-squared: 0.4829
## F-statistic: 46.76 on 1 and 48 DF, p-value: 1.303e-08
From the lm result, we can observe that median_income is a very significant variable with a p value of 1.3e-08. This indicates there is a strong association between income and heart disease mortality rate
Find the association between airquality and heart disease mortality
## make scatterplot
final_data %>%
mutate(state = fct_reorder(state, mortality_rate)) %>%
ggplot(aes(x = mortality_rate, y = pm2.5, color = state)) +
geom_point() +
ggtitle("Airquality VS Mortality Rate ") +
stat_smooth(method = "lm", col = "red") +
theme(legend.position = "bottom",
legend.direction = "horizontal",
axis.text.x = element_text(angle = 90, size = 6),
legend.key.size = unit(0.05, "cm")) +
labs(x = "Mortality Rate",
y = "PM2.5")
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## fit simple linear regression model
air_regression<-lm(final_data$mortality_rate~final_data$pm2.5)
summary(air_regression)
##
## Call:
## lm(formula = final_data$mortality_rate ~ final_data$pm2.5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -89.08 -38.02 -15.49 34.66 139.55
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.382e+02 1.427e+01 23.698 <2e-16 ***
## final_data$pm2.5 9.734e-04 4.673e-03 0.208 0.836
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 62.34 on 48 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.0009031, Adjusted R-squared: -0.01991
## F-statistic: 0.04339 on 1 and 48 DF, p-value: 0.8359
From the scatterplot, we can see that the points are spread randomlly. However, the relationship between pm2.5 and mortality rate is unclear. For the states, with low pm2.5, some of them have low mortality rate and some of them have high mortality rate. After we fit the simple regression model, the p-value for pm2.5 is 0.836, so it is a non-significant variable.
Find the association between average stroke value and average heart disease mortality, and SLR for all counties using wtfdata
mydata1=heart_disease_stratify%>%
filter(stratification1 == "Overall", stratification2 == "Overall")%>%
select(state, location_desc, mortality_rate)%>%
filter(!is.na(mortality_rate))
mydata2=read_csv("data/Stroke_Mortality_Data_Among_US_Adults__35___by_State_Territory_and_County.csv") %>%
janitor::clean_names() %>%
rename(state = location_abbr) %>%
rename(stroke_value = data_value) %>%
mutate(state = state.name[match(state, state.abb)])%>%
filter(stratification1 == "Overall", stratification2 == "Overall")%>%
select(state, location_desc, stroke_value)%>%
filter(!is.na(stroke_value))
## Parsed with column specification:
## cols(
## Year = col_integer(),
## LocationAbbr = col_character(),
## LocationDesc = col_character(),
## GeographicLevel = col_character(),
## DataSource = col_character(),
## Class = col_character(),
## Topic = col_character(),
## Data_Value = col_double(),
## Data_Value_Unit = col_character(),
## Data_Value_Type = col_character(),
## Data_Value_Footnote_Symbol = col_character(),
## Data_Value_Footnote = col_character(),
## StratificationCategory1 = col_character(),
## Stratification1 = col_character(),
## StratificationCategory2 = col_character(),
## Stratification2 = col_character(),
## TopicID = col_character(),
## LocationID = col_character(),
## `Location 1` = col_character()
## )
comdata = left_join(mydata1,mydata2)
## Joining, by = c("state", "location_desc")
final_data %>%
mutate(state = forcats::fct_reorder(state,stroke_value)) %>%
ggplot(aes(x = mortality_rate, y = stroke_value)) +
geom_point(aes(color = state), alpha = .5)
## Warning: Removed 1 rows containing missing values (geom_point).
comdata %>%
mutate(state = forcats::fct_reorder(state,stroke_value)) %>%
ggplot(aes(x = mortality_rate, y = stroke_value)) +
geom_point(aes(color = state), alpha = .5)
## Warning: Removed 2 rows containing missing values (geom_point).
stroke_slm=lm(mortality_rate~stroke_value,data=comdata)
summary(stroke_slm)
##
## Call:
## lm(formula = mortality_rate ~ stroke_value, data = comdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -374.77 -48.06 -7.98 37.16 955.31
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 143.23631 5.98863 23.92 <2e-16 ***
## stroke_value 2.74650 0.07525 36.50 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 75.07 on 3269 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.2895, Adjusted R-squared: 0.2893
## F-statistic: 1332 on 1 and 3269 DF, p-value: < 2.2e-16
find association between heart disease, stroke based, gender and race
new_heart_gender_data=read_csv("data/Heart_Disease_Mortality_Data_Among_US_Adults__35___by_State_Territory_and_County.csv") %>%
janitor::clean_names() %>%
rename(state = location_abbr) %>%
rename(mortality_rate = data_value) %>%
rename(gender=stratification1)%>%
rename(race=stratification2)%>%
mutate(state = state.name[match(state, state.abb)])%>%
filter(gender%in%c("Male","Female"))%>%
filter(race=="Overall")%>%
select(state,location_desc, mortality_rate,gender)
## Parsed with column specification:
## cols(
## Year = col_integer(),
## LocationAbbr = col_character(),
## LocationDesc = col_character(),
## GeographicLevel = col_character(),
## DataSource = col_character(),
## Class = col_character(),
## Topic = col_character(),
## Data_Value = col_double(),
## Data_Value_Unit = col_character(),
## Data_Value_Type = col_character(),
## Data_Value_Footnote_Symbol = col_character(),
## Data_Value_Footnote = col_character(),
## StratificationCategory1 = col_character(),
## Stratification1 = col_character(),
## StratificationCategory2 = col_character(),
## Stratification2 = col_character(),
## TopicID = col_character(),
## LocationID = col_character(),
## `Location 1` = col_character()
## )
new_stroke_gender_data=read_csv("data/Stroke_Mortality_Data_Among_US_Adults__35___by_State_Territory_and_County.csv") %>%
janitor::clean_names() %>%
rename(state = location_abbr) %>%
rename(stroke_value = data_value) %>%
rename(gender=stratification1)%>%
rename(race=stratification2)%>%
mutate(state = state.name[match(state, state.abb)])%>%
filter(gender%in%c("Male","Female"))%>%
filter(race=="Overall")%>%
select(state, location_desc, stroke_value,gender)
## Parsed with column specification:
## cols(
## Year = col_integer(),
## LocationAbbr = col_character(),
## LocationDesc = col_character(),
## GeographicLevel = col_character(),
## DataSource = col_character(),
## Class = col_character(),
## Topic = col_character(),
## Data_Value = col_double(),
## Data_Value_Unit = col_character(),
## Data_Value_Type = col_character(),
## Data_Value_Footnote_Symbol = col_character(),
## Data_Value_Footnote = col_character(),
## StratificationCategory1 = col_character(),
## Stratification1 = col_character(),
## StratificationCategory2 = col_character(),
## Stratification2 = col_character(),
## TopicID = col_character(),
## LocationID = col_character(),
## `Location 1` = col_character()
## )
new_gender_data=left_join(new_heart_gender_data,new_stroke_gender_data)%>%
filter(!is.na(stroke_value))
## Joining, by = c("state", "location_desc", "gender")
gender_reg=lm(mortality_rate~stroke_value+gender+stroke_value*gender,data=new_gender_data)
summary(gender_reg)
##
## Call:
## lm(formula = mortality_rate ~ stroke_value + gender + stroke_value *
## gender, data = new_gender_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -485.17 -48.03 -7.52 38.73 1158.15
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 112.49516 6.27099 17.939 < 2e-16 ***
## stroke_value 2.21351 0.08028 27.573 < 2e-16 ***
## genderMale 111.99707 8.56167 13.081 < 2e-16 ***
## stroke_value:genderMale 0.61112 0.10802 5.657 1.6e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 78.36 on 6466 degrees of freedom
## Multiple R-squared: 0.5931, Adjusted R-squared: 0.5929
## F-statistic: 3141 on 3 and 6466 DF, p-value: < 2.2e-16
ggplot(new_gender_data,aes(x=mortality_rate,y=stroke_value))+
geom_point(aes(color=gender), alpha = .5)
new_heart_race_data=read_csv("data/Heart_Disease_Mortality_Data_Among_US_Adults__35___by_State_Territory_and_County.csv") %>%
janitor::clean_names() %>%
rename(state = location_abbr) %>%
rename(mortality_rate = data_value) %>%
rename(gender=stratification1)%>%
rename(race=stratification2)%>%
mutate(state = state.name[match(state, state.abb)])%>%
filter(gender=="Overall")%>%
filter(!(race=="Overall"))%>%
select(state, location_desc, mortality_rate, race)
## Parsed with column specification:
## cols(
## Year = col_integer(),
## LocationAbbr = col_character(),
## LocationDesc = col_character(),
## GeographicLevel = col_character(),
## DataSource = col_character(),
## Class = col_character(),
## Topic = col_character(),
## Data_Value = col_double(),
## Data_Value_Unit = col_character(),
## Data_Value_Type = col_character(),
## Data_Value_Footnote_Symbol = col_character(),
## Data_Value_Footnote = col_character(),
## StratificationCategory1 = col_character(),
## Stratification1 = col_character(),
## StratificationCategory2 = col_character(),
## Stratification2 = col_character(),
## TopicID = col_character(),
## LocationID = col_character(),
## `Location 1` = col_character()
## )
new_stroke_race_data=read_csv("data/Stroke_Mortality_Data_Among_US_Adults__35___by_State_Territory_and_County.csv") %>%
janitor::clean_names() %>%
rename(state = location_abbr) %>%
rename(stroke_value = data_value) %>%
rename(gender=stratification1)%>%
rename(race=stratification2)%>%
mutate(state = state.name[match(state, state.abb)])%>%
filter(gender=="Overall")%>%
filter(!(race=="Overall"))%>%
select(state, location_desc, stroke_value,race)
## Parsed with column specification:
## cols(
## Year = col_integer(),
## LocationAbbr = col_character(),
## LocationDesc = col_character(),
## GeographicLevel = col_character(),
## DataSource = col_character(),
## Class = col_character(),
## Topic = col_character(),
## Data_Value = col_double(),
## Data_Value_Unit = col_character(),
## Data_Value_Type = col_character(),
## Data_Value_Footnote_Symbol = col_character(),
## Data_Value_Footnote = col_character(),
## StratificationCategory1 = col_character(),
## Stratification1 = col_character(),
## StratificationCategory2 = col_character(),
## Stratification2 = col_character(),
## TopicID = col_character(),
## LocationID = col_character(),
## `Location 1` = col_character()
## )
new_race_data=left_join(new_heart_race_data,new_stroke_race_data)%>%
filter(!is.na(stroke_value))
## Joining, by = c("state", "location_desc", "race")
race_reg=lm(mortality_rate~stroke_value+race+stroke_value*race,data=new_race_data)
summary(race_reg)
##
## Call:
## lm(formula = mortality_rate ~ stroke_value + race + stroke_value *
## race, data = new_race_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -430.83 -48.04 -7.57 40.13 1110.23
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 175.9394 13.1181 13.412
## stroke_value 2.5703 0.1566 16.414
## raceAsian and Pacific Islander -50.8487 15.7666 -3.225
## raceBlack 95.0941 15.4000 6.175
## raceHispanic -62.8309 14.8478 -4.232
## raceWhite -23.2946 15.0278 -1.550
## stroke_value:raceAsian and Pacific Islander -2.0687 0.2011 -10.285
## stroke_value:raceBlack -1.1864 0.1722 -6.892
## stroke_value:raceHispanic -1.1271 0.1901 -5.929
## stroke_value:raceWhite 0.1906 0.1830 1.041
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## stroke_value < 2e-16 ***
## raceAsian and Pacific Islander 0.00127 **
## raceBlack 7.01e-10 ***
## raceHispanic 2.35e-05 ***
## raceWhite 0.12116
## stroke_value:raceAsian and Pacific Islander < 2e-16 ***
## stroke_value:raceBlack 6.02e-12 ***
## stroke_value:raceHispanic 3.21e-09 ***
## stroke_value:raceWhite 0.29775
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 81.27 on 6670 degrees of freedom
## (13 observations deleted due to missingness)
## Multiple R-squared: 0.595, Adjusted R-squared: 0.5945
## F-statistic: 1089 on 9 and 6670 DF, p-value: < 2.2e-16
ggplot(new_race_data, aes(x=mortality_rate,y=stroke_value))+
geom_point(aes(color=race))
## Warning: Removed 13 rows containing missing values (geom_point).
* From the interaction, we find that the p-value of “stroke_value : gender” is very small, indicate “gender” modifies the relationship between geart disease mortality and stroke mortality. With the plot, we can find that males have higher heart disease mortality rate, but the stroke mortality rate is similar. * From the linear regression model, we find that the Black has the highest heart disease mortality rate, and the Hispanic the least. ???????
library(plotly)
map_data = final_data %>%
mutate(state = tolower(state))
states <- map_data("state") %>%
rename(state = region)
a = left_join(states, map_data, by = "state") %>%
mutate(text_label = str_c("Region: ", state, 'Mortality rate: ', mortality_rate) )
a$text_label <- with(a, paste(state, '<br>', "Mortality_rate", mortality_rate))
# give state boundaries a white border
l <- list(color = toRGB("white"), width = 2)
# specify some map projection/options
g <- list(
scope = 'usa',
projection = list(type = 'albers usa'),
showlakes = TRUE,
lakecolor = toRGB('white')
)
p <- plot_geo(a, locationmode = 'USA-states') %>%
add_trace(
z = ~mortality_rate, text = ~text_label, locations = ~us,
color = ~mortality_rate, colors = 'Purples'
) %>%
colorbar(title = "Millions USD") %>%
layout(
title = '2011 US Agriculture Exports by State<br>(Hover for breakdown)',
geo = g
)
## Warning: Ignoring 10 observations
p
final_data = final_data %>%
janitor::clean_names()
total_lm = lm(mortality_rate ~ tobacco_consumption + median_income + obesity_percentage + pm2_5 + stroke_value, data = final_data) %>%
fortify()
ggplot(total_lm, aes(x = .fitted, y = .resid)) + geom_point() +
stat_smooth(method = "lm", col = "red") +
labs(
x = "Fitted value",
y = "Residual",
title = "Residuals vs Fitted"
) +
theme(plot.title = element_text(hjust = 0.5))